{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "frTMl3sShA3P"
   },
   "outputs": [],
   "source": [
    "from __future__ import absolute_import\n",
    "from __future__ import division\n",
    "from __future__ import print_function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "height": 321,
     "output_extras": [
      {
       "item_id": 1
      }
     ]
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 2880,
     "status": "error",
     "timestamp": 1505781339378,
     "user": {
      "displayName": "Sara Robinson",
      "photoUrl": "//lh4.googleusercontent.com/-RR9n0dvbwgI/AAAAAAAAAAI/AAAAAAAAMYM/SOr5ZExpvXE/s50-c-k-no/photo.jpg",
      "userId": "112510032804989247452"
     },
     "user_tz": 240
    },
    "id": "783h64rGhA3T",
    "outputId": "d447b2ab-e321-4ee5-abd4-de2c0116302f"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n",
      "  return f(*args, **kwds)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "You have TensorFlow version 1.4.0\n"
     ]
    }
   ],
   "source": [
    "import itertools\n",
    "import os\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "import tensorflow as tf\n",
    "from tensorflow import keras\n",
    "\n",
    "# This code was tested with TensorFlow v1.4\n",
    "print(\"You have TensorFlow version\", tf.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "c7te21f7hA3V"
   },
   "outputs": [],
   "source": [
    "# The CSV was generated from this query: https://bigquery.cloud.google.com/savedquery/513927984416:c494494324be4a80b1fc55f613abb39c\n",
    "# The data is also publicly available at this Cloud Storage URL: https://storage.googleapis.com/tensorflow-workshop-examples/stack-overflow-data.csv\n",
    "data = pd.read_csv(\"stack-overflow-data.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "output_extras": [
      {}
     ]
    },
    "colab_type": "code",
    "id": "chAbp3ryhA3X",
    "outputId": "3a16921d-a3be-4c43-a5e2-6012220ee13a"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>post</th>\n",
       "      <th>tags</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>what is causing this behavior  in our c# datet...</td>\n",
       "      <td>c#</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>have dynamic html load as if it was in an ifra...</td>\n",
       "      <td>asp.net</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>how to convert a float value in to min:sec  i ...</td>\n",
       "      <td>objective-c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>.net framework 4 redistributable  just wonderi...</td>\n",
       "      <td>.net</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>trying to calculate and print the mean and its...</td>\n",
       "      <td>python</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                post         tags\n",
       "0  what is causing this behavior  in our c# datet...           c#\n",
       "1  have dynamic html load as if it was in an ifra...      asp.net\n",
       "2  how to convert a float value in to min:sec  i ...  objective-c\n",
       "3  .net framework 4 redistributable  just wonderi...         .net\n",
       "4  trying to calculate and print the mean and its...       python"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "jquery           2000\n",
       "c++              2000\n",
       "javascript       2000\n",
       "ios              2000\n",
       "java             2000\n",
       "sql              2000\n",
       "html             2000\n",
       "angularjs        2000\n",
       "mysql            2000\n",
       "android          2000\n",
       "objective-c      2000\n",
       "c                2000\n",
       "asp.net          2000\n",
       "python           2000\n",
       "c#               2000\n",
       "ruby-on-rails    2000\n",
       "iphone           2000\n",
       "php              2000\n",
       ".net             2000\n",
       "css              2000\n",
       "Name: tags, dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Confirm that we have a balanced dataset\n",
    "# Note: data was randomly shuffled in our BigQuery query\n",
    "data['tags'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "output_extras": [
      {}
     ]
    },
    "colab_type": "code",
    "id": "h_SDal0khA3n",
    "outputId": "e6c311e5-c674-4cf2-f2dc-d6ceabfa6f83"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train size: 32000\n",
      "Test size: 8000\n"
     ]
    }
   ],
   "source": [
    "# Split data into train and test\n",
    "train_size = int(len(data) * .8)\n",
    "print (\"Train size: %d\" % train_size)\n",
    "print (\"Test size: %d\" % (len(data) - train_size))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "anD38iilhA3r"
   },
   "outputs": [],
   "source": [
    "train_posts = data['post'][:train_size]\n",
    "train_tags = data['tags'][:train_size]\n",
    "\n",
    "test_posts = data['post'][train_size:]\n",
    "test_tags = data['tags'][train_size:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "z4GblctFhA3u"
   },
   "outputs": [],
   "source": [
    "max_words = 1000\n",
    "tokenize = keras.preprocessing.text.Tokenizer(num_words=max_words, char_level=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "YatMLCKXhA3x"
   },
   "outputs": [],
   "source": [
    "tokenize.fit_on_texts(train_posts) # only fit on train\n",
    "x_train = tokenize.texts_to_matrix(train_posts).astype(np.float32) # model expects float32\n",
    "x_test = tokenize.texts_to_matrix(test_posts).astype(np.float32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "8quTsErLhA3z"
   },
   "outputs": [],
   "source": [
    "# Use sklearn utility to convert label strings to numbered index\n",
    "encoder = LabelEncoder()\n",
    "encoder.fit(train_tags)\n",
    "y_train = encoder.transform(train_tags)\n",
    "y_test = encoder.transform(test_tags)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "gtP-hDRZhA31"
   },
   "outputs": [],
   "source": [
    "# Converts the labels to a one-hot representation\n",
    "num_classes = np.max(y_train) + 1\n",
    "y_train = keras.utils.to_categorical(y_train, num_classes).astype(np.float32)\n",
    "y_test = keras.utils.to_categorical(y_test, num_classes).astype(np.float32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "output_extras": [
      {}
     ]
    },
    "colab_type": "code",
    "id": "XZFsdLYVhA33",
    "outputId": "882923f3-6705-46b5-be88-3d4fec2965f2"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "x_train shape: (32000, 1000)\n",
      "x_test shape: (8000, 1000)\n",
      "y_train shape: (32000, 20)\n",
      "y_test shape: (8000, 20)\n"
     ]
    }
   ],
   "source": [
    "# Inspect the dimenstions of our training and test data (this is helpful to debug)\n",
    "print('x_train shape:', x_train.shape)\n",
    "print('x_test shape:', x_test.shape)\n",
    "print('y_train shape:', y_train.shape)\n",
    "print('y_test shape:', y_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "cBIkzTOZhA36"
   },
   "outputs": [],
   "source": [
    "# This model trains very quickly and 2 epochs are already more than enough\n",
    "# Training for more epochs will likely lead to overfitting on this dataset\n",
    "# You can try tweaking these hyperparamaters when using this model with your own data\n",
    "batch_size = 32\n",
    "epochs = 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "output_extras": [
      {}
     ]
    },
    "colab_type": "code",
    "id": "XdrFuwx4hA39",
    "outputId": "4b002559-2f06-4681-8f02-2e76e62d7a57"
   },
   "outputs": [],
   "source": [
    "# Build the model\n",
    "model = keras.models.Sequential()\n",
    "model.add(keras.layers.Dense(512, input_shape=(max_words,), name=\"posts\"))\n",
    "model.add(keras.layers.Activation('relu'))\n",
    "model.add(keras.layers.Dropout(0.5))\n",
    "model.add(keras.layers.Dense(num_classes))\n",
    "model.add(keras.layers.Activation('softmax'))\n",
    "\n",
    "model.compile(loss='categorical_crossentropy',\n",
    "              optimizer='adam',\n",
    "              metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Using the Keras model from memory.\n",
      "INFO:tensorflow:Using default config.\n",
      "WARNING:tensorflow:Using temporary folder as model directory: /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmp0x6nb2bs\n",
      "INFO:tensorflow:Using config: {'_model_dir': '/var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmp0x6nb2bs', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x10254de10>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}\n"
     ]
    }
   ],
   "source": [
    "# Convert Keras model to estimator\n",
    "estimator_model = keras.estimator.model_to_estimator(keras_model=model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define input fn\n",
    "def input_function(features, labels=None, shuffle=False):\n",
    "    input_fn = tf.estimator.inputs.numpy_input_fn(\n",
    "        x={\"posts_input\": features}, # See the accompanying blog post for explanation on naming\n",
    "        y=labels,\n",
    "        shuffle=shuffle\n",
    "    )\n",
    "    return input_fn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Create CheckpointSaverHook.\n",
      "INFO:tensorflow:Restoring parameters from /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmp0x6nb2bs/\n",
      "INFO:tensorflow:Saving checkpoints for 1 into /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmp0x6nb2bs/model.ckpt.\n",
      "INFO:tensorflow:loss = 3.0097, step = 1\n",
      "INFO:tensorflow:global_step/sec: 96.0292\n",
      "INFO:tensorflow:loss = 1.06048, step = 101 (1.044 sec)\n",
      "INFO:tensorflow:global_step/sec: 97.6674\n",
      "INFO:tensorflow:loss = 0.688308, step = 201 (1.022 sec)\n",
      "INFO:tensorflow:Saving checkpoints for 251 into /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmp0x6nb2bs/model.ckpt.\n",
      "INFO:tensorflow:Loss for final step: 0.855072.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.estimator.estimator.Estimator at 0x119350748>"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "estimator_model.train(input_fn=input_function(x_train, y_train, shuffle=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "output_extras": [
      {}
     ]
    },
    "colab_type": "code",
    "id": "zjwBD8qFhA4D",
    "outputId": "0dda5da5-44c4-4fbc-f2ad-01d642ca1914",
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Starting evaluation at 2017-12-18-23:30:47\n",
      "INFO:tensorflow:Restoring parameters from /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmp0x6nb2bs/model.ckpt-251\n",
      "INFO:tensorflow:Finished evaluation at 2017-12-18-23:30:48\n",
      "INFO:tensorflow:Saving dict for global step 251: accuracy = 0.813244, global_step = 251, loss = 0.644977\n",
      "{'accuracy': 0.81324404, 'loss': 0.64497727, 'global_step': 251}\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the accuracy of our trained model\n",
    "score = estimator_model.evaluate(input_function(x_test, labels=y_test))\n",
    "print(score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     },
     "output_extras": [
      {}
     ]
    },
    "colab_type": "code",
    "id": "f000lYoxhA4F",
    "outputId": "21cd198f-1979-4b40-a2fd-891a1c0248db"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Restoring parameters from /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmp0x6nb2bs/model.ckpt-251\n",
      "Actual label:jquery\n",
      "Predicted label: jquery\n",
      "\n",
      "Actual label:.net\n",
      "Predicted label: .net\n",
      "\n",
      "Actual label:android\n",
      "Predicted label: android\n",
      "\n",
      "Actual label:c++\n",
      "Predicted label: c++\n",
      "\n",
      "Actual label:.net\n",
      "Predicted label: c#\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Here's how to generate a prediction on individual examples\n",
    "text_labels = encoder.classes_ \n",
    "\n",
    "# We'll make predictions for the first five examples\n",
    "examples = x_test[:5]\n",
    "predictions = list(estimator_model.predict(input_function(examples)))\n",
    "\n",
    "# Print out the true and expected label\n",
    "for i in range(len(examples)):\n",
    "    prediction_array = list(predictions[i].values())[0]\n",
    "    predicted_label = text_labels[np.argmax(prediction_array)]\n",
    "    print('Actual label:' + test_tags.iloc[i])\n",
    "    print(\"Predicted label: \" + predicted_label + \"\\n\")  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": { },
 "nbformat": 4,
 "nbformat_minor": 1
}
